library(tidyverse)
library(ggthemes)
library(readxl)


# Import data
# Read the literature spreadsheet (path is relative to the working directory).
df <- read_excel("data/Transitional Justice Literature.xlsx")

# Drop the first data row. slice(-1) is used instead of df[2:nrow(df), ]
# because 2:nrow(df) counts backwards (c(2, 1)) when the sheet holds one row.
# NOTE(review): presumably row 1 is a second header / description row -- confirm.
df <- df %>% slice(-1)

# Tabulate the languages present before restricting the sample.
df$Language %>% table()

# Keep English-language studies only; rows with a missing Language are
# dropped as well because filter() discards NA conditions.
df <- df %>% filter(Language == "English")

# Countries of authors ----------------------------------------------------

# Count how often each country appears among the authors' universities.
# A cell may list several countries separated by ";"; each entry counts once.
# The cleaned table is built a single time and feeds both the
# "Other Countries" total and the plotted bars, so the two always agree.
author_country_counts <- df %>%
  select(countries = `Countries of authors' universities`) %>%
  # separate_rows() copes with any number of ";"-separated entries, so no
  # entry is silently discarded by a fixed column count.
  separate_rows(countries, sep = ";") %>%
  transmute(
    value = countries %>%
      str_remove("\\(but see note\\)") %>%
      # Trim before stripping the article: entries after a ";" start with a
      # space, which would otherwise keep "^The " from matching.
      str_trim(side = "both") %>%
      str_remove("^The ") %>%
      str_trim(side = "both")
  ) %>%
  filter(!is.na(value), value != "") %>%
  mutate(
    # Union of the recodes previously applied separately to the "others"
    # count and to the plot.
    # NOTE(review): "Ireland" is folded into "United Kingdom" because the
    # original plot did so -- confirm this is intended.
    value = recode(
      value,
      "England" = "United Kingdom",
      "Northern Ireland" = "United Kingdom",
      "Ireland" = "United Kingdom",
      "Netherlands" = "The Netherlands"
    )
  ) %>%
  count(value, name = "n")

# Number of studies from countries that showed up only once in the data.
others <- author_country_counts %>%
  filter(n == 1) %>%
  pull(n) %>%
  sum()

# Plot number of studies by country of author; countries that appear once
# are pooled into a single "Other Countries" bar.
author_country_counts %>%
  filter(n > 1) %>%
  bind_rows(tibble(value = "Other Countries", n = others)) %>%
  # fill = "a" is a constant so that scale_fill_grey() colours every bar alike.
  ggplot(aes(x = reorder(value, -n), y = n, fill = "a")) +
  geom_col() +
  scale_fill_grey() +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(
    x = "Country",
    y = "Count",
    subtitle = "Countries of authors' universities"
  )
ggsave("figs/authors_universities.pdf", height = 4, width = 7)


# Countries of study ----------------------------------------------------

# Count how often each country appears as a country of study.
# A cell may list several countries separated by ";"; each entry counts once.
# The cleaned table is built a single time and feeds both the
# "Other Countries" total and the plotted bars, so the two always agree.
study_country_counts <- df %>%
  select(countries = `Country of study (where participants are from)`) %>%
  # separate_rows() copes with any number of ";"-separated entries, so no
  # entry is silently discarded by a fixed column count.
  separate_rows(countries, sep = ";") %>%
  transmute(
    value = countries %>%
      str_remove("\\(but see note\\)") %>%
      str_trim(side = "both")
  ) %>%
  filter(!is.na(value), value != "") %>%
  mutate(
    # Union of the spelling fixes previously applied separately to the
    # "others" count and to the plot (the en-dash Bosnia variant and the
    # Northern Ireland variant were each handled in only one of them).
    value = recode(
      value,
      "Democratic Republic of Congo" = "Congo",
      "Democratic Republic of the Congo" = "Congo",
      "the United States" = "United States",
      "the Philippines" = "Philippines",
      "Bosnia and\nHerzegovina" = "Bosnia-Herzegovina",
      "Bosnia–Herze-\ngovina" = "Bosnia-Herzegovina",
      "Columbia" = "Colombia",
      "United Kingdom (Northern Ireland)" = "Northern Ireland"
    )
  ) %>%
  count(value, name = "n")

# Number of studies on countries that showed up only once in the data.
others <- study_country_counts %>%
  filter(n == 1) %>%
  pull(n) %>%
  sum()

# Plot number of studies by country; countries that appear once are pooled
# into a single "Other Countries" bar.
study_country_counts %>%
  filter(n > 1) %>%
  bind_rows(tibble(value = "Other Countries", n = others)) %>%
  # fill = "a" is a constant so that scale_fill_grey() colours every bar alike.
  ggplot(aes(x = reorder(value, -n), y = n, fill = "a")) +
  geom_col() +
  scale_fill_grey() +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(x = "Country", y = "Count", subtitle = "Countries of study")
ggsave("figs/study_country.pdf", height = 4, width = 8)


# Research method over time ----------------------------------------------------

# Note: some studies use multiple research methods; each listed method is
# counted once for the study's publication year.
df_plot <- df %>%
  select(method = `Research Method`, year = `Publication year`) %>%
  # NOTE(review): unlist() is kept from the original; presumably the year
  # column can arrive as a list -- confirm.
  mutate(year = as.numeric(unlist(year))) %>%
  # separate_rows() copes with any number of ";"-separated methods, so no
  # entry is silently discarded by a fixed column count.
  separate_rows(method, sep = ";") %>%
  transmute(year, value = str_trim(method, side = "both")) %>%
  filter(!is.na(value), value != "", !is.na(year)) %>%
  mutate(
    # Collapse every "other ..." description into one category.
    value = str_replace(value, "^other.+", "other"),
    # Harmonise singular/plural and wording variants.
    value = recode(
      value,
      "interview" = "interviews",
      "field experiment" = "field experiments",
      "natural experiment" = "natural experiments",
      "lab-in-field experiment" = "lab-in-field experiments",
      "survey" = "surveys",
      "survey experiment" = "survey experiments",
      "participatory observation" = "participant observation"
    )
  ) %>%
  # count() returns an ungrouped table, unlike group_by() + summarise().
  count(year, value, name = "n")
df_plot$value %>% table()

# Complete dataset with missing year-method combinations
all_years <- seq(1999, 2023)
all_values <- unique(df_plot$value)
# stringsAsFactors = FALSE keeps `value` a character column so the join
# below compares like with like.
expanded_data <- expand.grid(
  year = all_years,
  value = all_values,
  stringsAsFactors = FALSE
)

# Merge the two: every year-method pair in the grid is kept (years outside
# the grid are dropped) and pairs without studies get a count of zero.
df_plot <- df_plot %>%
  right_join(expanded_data, by = c("year", "value")) %>%
  mutate(n = replace_na(n, 0L))

# Facet order for the plot.
method_levels <- c(
  "interviews",
  "focus groups",
  "participant observation",
  "surveys",
  "survey experiments",
  "field experiments",
  "lab-in-field experiments",
  "natural experiments",
  "public testimony",
  "text analysis",
  "case-control study",
  "other"
)

# factor() turns any method missing from method_levels into NA; surface
# that instead of letting it pass unnoticed.
unknown_methods <- setdiff(all_values, method_levels)
if (length(unknown_methods) > 0) {
  warning(
    "Methods without a facet level (plotted as NA): ",
    paste(unknown_methods, collapse = ", "),
    call. = FALSE
  )
}

# One line per method showing the yearly number of studies. No colour
# aesthetic is mapped, so no colour scale is needed.
df_plot %>%
  mutate(value = factor(value, levels = method_levels)) %>%
  ggplot(aes(x = year, y = n)) +
  geom_line() +
  theme_bw() +
  facet_wrap(~ value) +
  labs(x = "Year", y = "Count", subtitle = "Study methods over time")
ggsave("figs/method_time.pdf", height = 4, width = 7)


# Justice Mechanism ----------------------------------------------------
# Bar chart of how often each transitional justice mechanism is studied.
# A cell may list several mechanisms separated by "," or ";".
df %>%
  select(mechanism = `TJ Mechanisms (MR Simplified Coding)`) %>%
  mutate(
    mechanism = mechanism %>%
      # Strip parenthetical notes before splitting, so a "," or ";" inside
      # a note cannot tear it into bogus categories.
      str_remove_all("\\([^()]*\\)") %>%
      # Treat commas as separators as well.
      str_replace_all(",", ";")
  ) %>%
  # separate_rows() copes with any number of entries, so none is silently
  # discarded by a fixed column count.
  separate_rows(mechanism, sep = ";") %>%
  transmute(value = str_trim(mechanism, side = "both")) %>%
  filter(!is.na(value), value != "") %>%
  mutate(
    value = str_to_title(value),
    # Title-casing lower-cases the acronym; restore it.
    value = recode(value, "Ddr" = "DDR")
  ) %>%
  count(value, name = "n") %>%
  # fill = "a" is a constant so that scale_fill_grey() colours every bar alike.
  ggplot(aes(x = reorder(value, -n), y = n, fill = "a")) +
  geom_col() +
  scale_fill_grey() +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(
    x = "Mechanism",
    y = "Count",
    subtitle = "Transitional justice mechanism"
  )
ggsave("figs/justice_mechanism.pdf", height = 4, width = 7)


# Authors fields ----------------------------------------------------------

# Inspect the raw field entries.
df$Field %>% unique()


# Bar chart of the research fields that occur more than twice.
# A cell may list several fields separated by "," or ";".
df %>%
  select(field = `Field`) %>%
  mutate(
    field = field %>%
      # Strip parenthetical notes before splitting, so a "," or ";" inside
      # a note cannot tear it into bogus categories.
      str_remove_all("\\([^()]*\\)") %>%
      # Treat commas as separators as well.
      str_replace_all(",", ";")
  ) %>%
  # separate_rows() copes with any number of entries, so none is silently
  # discarded by a fixed column count.
  separate_rows(field, sep = ";") %>%
  transmute(value = str_trim(field, side = "both")) %>%
  filter(!is.na(value), value != "") %>%
  mutate(
    value = str_to_title(value),
    # Lower-case the conjunction only; the word boundaries keep words that
    # merely start with "And" (e.g. "Andean") intact.
    value = str_replace_all(value, "\\bAnd\\b", "and")
  ) %>%
  count(value, name = "n") %>%
  filter(n > 2) %>%
  # fill = "a" is a constant so that scale_fill_grey() colours every bar alike.
  ggplot(aes(x = reorder(value, -n), y = n, fill = "a")) +
  geom_col() +
  scale_fill_grey() +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(x = "Field", y = "Count", subtitle = "Research field")
ggsave("figs/field.pdf", height = 4, width = 5)


